The purpose of this document is to reproduce the pipeline used to score genes on the basis of their expression in different Plasmodium species and life cycle stages, with the goal of selecting liver stage candidates.
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(magrittr)
library(tidyr)
##
## Attaching package: 'tidyr'
## The following object is masked from 'package:magrittr':
##
## extract
library(ggplot2)
library(scales)
library(ggrepel)
library(ggpubr)
library(patchwork)
library(openxlsx)
source('~/R projects/scriptsCommon/colorworks.R')
RNA-seq data for different species and life cycle stages was retrieved from PlasmoDB as separate datasets. They need to be integrated before they can be used. For some datasets with multiple replicates, the expression of a gene is summarized as the mean or the geometric mean of all values.
geomean <- function(x){exp(mean(log(x)))}
pcyn100 <- read.delim('Data/PlasmoDB/Pcyn_100daysBloodRNAseq.txt')
pcyn100 <- pcyn100[,(!grepl('antisense', colnames(pcyn100)))]
pcyn100 <- pcyn100[,!(grepl('sense', colnames(pcyn100)) & ! grepl('unique.only', colnames(pcyn100)))]
pcyn100 %<>% select(-X)
pcyn100 <- pcyn100[,!(grepl('sense', colnames(pcyn100)) & !grepl('WB.+TP2', colnames(pcyn100)))] #TP2 corresponds to acute parasitemia, WB is for whole blood
colnames(pcyn100) <- gsub('(.+\\.\\.\\.WB_)(.+)(\\.TP2\\.\\.\\.unique.only)','\\2',colnames(pcyn100))
pcyn100 %<>% rename('P.cyn.Gene.ID'='Gene.ID')
head(pcyn100)
## P.cyn.Gene.ID source_id Gene.Name.or.Symbol gene_source_id
## 1 PcyM_0000100 PcyM_0000100-t36_1 N/A PcyM_0000100
## 2 PcyM_0000200 PcyM_0000200-t36_1 N/A PcyM_0000200
## 3 PcyM_0000300 PcyM_0000300-t36_1 N/A PcyM_0000300
## 4 PcyM_0000350 PcyM_0000350-t36_1 N/A PcyM_0000350
## 5 PcyM_0000400 PcyM_0000400-t36_1 N/A PcyM_0000400
## 6 PcyM_0000500 PcyM_0000500-t36_1 N/A PcyM_0000500
## Mmul_RIc14.Day21 Mmul_RSb14.Day21 Mmul_RMe14.Day20 Mmul_RFa14.Day18
## 1 2.19 0.32 1.13 0.54
## 2 0.00 0.00 0.07 0.06
## 3 11.17 14.62 13.65 12.63
## 4 0.44 0.26 0.40 0.40
## 5 8.68 7.66 9.44 11.45
## 6 7.77 9.07 8.41 9.29
pcyn_ort <- read.delim('Data/PlasmoDB/PcynPfal_SyntenicOrthologs.txt')
pcyn100 <- merge(pcyn100, pcyn_ort, by.x='P.cyn.Gene.ID', by.y='Input.Ortholog.s.', all.x=TRUE, all.y=FALSE)
pcyn100 %<>%
filter(!is.na(Gene.ID)) %>%
select(-c(source_id.x, source_id.y, X, Gene.Name.or.Symbol.x)) %>%
rename('Gene.Name.or.Symbol'='Gene.Name.or.Symbol.y')
pcyn100 %<>% rowwise() %>% mutate(PcynB.mean=mean(c_across(where(is.numeric))),
PcynB.max=max(c_across(where(is.numeric)))) %>%
ungroup()
pcyn100 %>% slice_sample(n=5)
## # A tibble: 5 × 11
## P.cyn.Gene.ID gene_source_id Mmul_RIc14.Day21 Mmul_RSb14.Day21
## <chr> <chr> <dbl> <dbl>
## 1 PcyM_0621300 PcyM_0621300 45.7 50.9
## 2 PcyM_1305300 PcyM_1305300 64.0 143.
## 3 PcyM_0410700 PcyM_0410700 333. 261.
## 4 PcyM_1445000 PcyM_1445000 159. 157.
## 5 PcyM_1443300 PcyM_1443300 46.5 49
## # ℹ 7 more variables: Mmul_RMe14.Day20 <dbl>, Mmul_RFa14.Day18 <dbl>,
## # Gene.ID <chr>, Organism <chr>, Gene.Name.or.Symbol <chr>, PcynB.mean <dbl>,
## # PcynB.max <dbl>
pcyn <- read.table('Data/PlasmoDB/Cubi_PcynNormalizedCounts.txt', sep='\t', header=TRUE)
colnames(pcyn)[5:7] <- c('P.cyn.Liver.Schizont','P.cyn.Blood','P.cyn.Hypnozoite')
pcyn %<>% select(-c(2,3,8))
pcyn %<>% rename(P.cyn.Gene.ID=Gene.ID, P.cyn.Symbol=Gene.Name.or.Symbol, PcynB.liverDataset=P.cyn.Blood)
pcyn %<>% select(c('P.cyn.Gene.ID','P.cyn.Liver.Schizont','PcynB.liverDataset')) %>%
right_join(pcyn_ort %>% select(Input.Ortholog.s., Gene.ID, Gene.Name.or.Symbol),
by=c('P.cyn.Gene.ID'='Input.Ortholog.s.')) #I'm keeping the data only for Pc genes that have syntenic orthologs in Pf
pcyn %<>% distinct(.keep_all = T)
pcyn %>% slice_sample(n=5)
## P.cyn.Gene.ID P.cyn.Liver.Schizont PcynB.liverDataset Gene.ID
## 1 PcyM_1454100 243.82 128.85 PF3D7_1231500
## 2 PcyM_0946100 78.13 63.77 PF3D7_1141400
## 3 PcyM_1019400 102.38 73.65 PF3D7_0513400
## 4 PcyM_0720100 139.51 57.03 PF3D7_0921300
## 5 PcyM_1270100 453.02 385.22 PF3D7_1445700
## Gene.Name.or.Symbol
## 1 N/A
## 2 PIGH
## 3 YihA1
## 4 N/A
## 5 N/A
pcyn %<>%
left_join(pcyn100[,c(1,3:6,10,11)],
by='P.cyn.Gene.ID')
## Warning in left_join(., pcyn100[, c(1, 3:6, 10, 11)], by = "P.cyn.Gene.ID"): Detected an unexpected many-to-many relationship between `x` and `y`.
## ℹ Row 168 of `x` matches multiple rows in `y`.
## ℹ Row 462 of `y` matches multiple rows in `x`.
## ℹ If a many-to-many relationship is expected, set `relationship =
## "many-to-many"` to silence this warning.
pcyn %>% slice_sample(n=5)
## P.cyn.Gene.ID P.cyn.Liver.Schizont PcynB.liverDataset
## 1 PcyM_1029800 15.67 288.27
## 2 PcyM_0213000 25.14 32.04
## 3 PcyM_0306400,PcyM_0306500 NA NA
## 4 PcyM_0734300 94.50 20.22
## 5 PcyM_1140000 112.70 69.64
## Gene.ID Gene.Name.or.Symbol Mmul_RIc14.Day21 Mmul_RSb14.Day21
## 1 PF3D7_0504300 N/A 42.08 44.52
## 2 PF3D7_0104700 MFR2 3.57 4.40
## 3 PF3D7_0404400 P36 NA NA
## 4 PF3D7_0932800 CSE1 137.12 136.47
## 5 PF3D7_0611400 SWIB 53.03 57.71
## Mmul_RMe14.Day20 Mmul_RFa14.Day18 PcynB.mean PcynB.max
## 1 64.33 44.17 48.7750 64.33
## 2 8.19 7.74 5.9750 8.19
## 3 NA NA NA NA
## 4 123.76 123.57 130.2300 137.12
## 5 51.77 54.06 54.1425 57.71
pcyn %>%
mutate_at(.vars=c('PcynB.liverDataset','PcynB.mean'), .funs=rank) %>%
ggplot(aes(x=PcynB.liverDataset, y=PcynB.mean))+
geom_density2d_filled(show.legend=FALSE)+
geom_point(size=0.05, color='white', alpha=0.2)+
stat_cor(method='spearman', color='white')+
scale_fill_viridis_d(option='mako')+
scale_x_continuous(expand=expansion(mult=0))+
scale_y_continuous(expand=expansion(mult=0))+
theme_classic()+theme(aspect.ratio=1)
offst <- 0.001
pcyn %<>% mutate(PcL=log2( (P.cyn.Liver.Schizont+offst)/(PcynB.mean+offst) ),
PcD=P.cyn.Liver.Schizont-PcynB.mean)
pcyn %>%
slice_sample(n=5)
## P.cyn.Gene.ID P.cyn.Liver.Schizont PcynB.liverDataset Gene.ID
## 1 PcyM_0414800 81.45 22.09 PF3D7_0208600
## 2 PcyM_1104800 2167.36 1441.75 PF3D7_1366500
## 3 PcyM_1425900 153.63 102.80 PF3D7_0715700
## 4 PcyM_0911100 4.95 23.75 PF3D7_1109100
## 5 PcyM_0941800 97.22 39.68 PF3D7_1136800
## Gene.Name.or.Symbol Mmul_RIc14.Day21 Mmul_RSb14.Day21 Mmul_RMe14.Day20
## 1 RRF1 23.43 25.68 19.11
## 2 NDK 268.81 263.05 382.45
## 3 N/A 22.29 25.76 23.79
## 4 N/A 113.96 122.07 78.96
## 5 N/A 14.52 16.87 15.64
## Mmul_RFa14.Day18 PcynB.mean PcynB.max PcL PcD
## 1 20.38 22.1500 25.68 1.878561 59.3000
## 2 234.95 287.3150 382.45 2.915229 1880.0450
## 3 17.41 22.3125 25.76 2.783481 131.3175
## 4 22.69 84.4200 122.07 -4.091810 -79.4700
## 5 16.02 15.7625 16.87 2.624680 81.4575
pcyn %>% ggplot(aes(x=PcL))+stat_density()+
geom_boxplot(mapping=aes(y=-1e-2), width=0.01)+
ggtitle(expr(italic('P. cynomolgi')))+
theme_classic()
## Warning: Removed 59 rows containing non-finite outside the scale range
## (`stat_density()`).
## Warning: Removed 59 rows containing non-finite outside the scale range
## (`stat_boxplot()`).
pviv <- read.delim('Data/PlasmoDB/Gural_PvivNormalizedCounts.txt', sep='\t', header=TRUE)
colnames(pviv)[c(4,5)] <- c('P.viv.Mixed1','P.viv.Mixed2')
pviv %<>% rename(P.viv.Gene.ID=Gene.ID) %>% select(-c(source_id, X))
pviv_ort <- read.delim('Data/PlasmoDB/PvivPfal_SyntenicOrthologs.txt', sep='\t', header=TRUE)
pviv_ort %<>% select(-c(source_id, X))
pviv_ort %<>% rename(P.viv.Gene.ID=Input.Ortholog.s.)
pviv %<>%
right_join(pviv_ort, by='P.viv.Gene.ID') %>%
distinct() %>%
distinct(.keep_all=TRUE)
# Summarize normalized counts for replicates as the mean
pviv %<>%
rowwise() %>%
mutate(Pviv.Lmean=mean(c(P.viv.Mixed1,P.viv.Mixed2))) %>%
ungroup()
pviv %>%
slice_sample(n=5)
## # A tibble: 5 × 9
## P.viv.Gene.ID gene_source_id.x P.viv.Mixed1 P.viv.Mixed2 Gene.ID
## <chr> <chr> <dbl> <dbl> <chr>
## 1 PVP01_1213600 PVP01_1213600 720. 507. PF3D7_1341200
## 2 PVP01_0213500 PVP01_0213500 83.1 63.2 PF3D7_0727800
## 3 PVP01_0725400 PVP01_0725400 44.0 46.6 PF3D7_0926900
## 4 PVP01_1434600 PVP01_1434600 1.25 2.48 PF3D7_1215800
## 5 PVP01_0935200 PVP01_0935200 83.0 78.9 PF3D7_1134400
## # ℹ 4 more variables: Product.Description <chr>, gene_source_id.y <chr>,
## # Gene.Name.or.Symbol <chr>, Pviv.Lmean <dbl>
#Pviv IED cycle ####
pviv.B <- read.delim('Data/PlasmoDB/Pviv_IEDCycle.txt') %>% select(-X)
colnames(pviv.B) <- gsub('(smru.+\\.\\.\\.)(smru.+)(\\.\\.\\..+)', '\\2', colnames(pviv.B))
pviv.B %<>% rename(P.viv.Gene.ID=Gene.ID)
# Get summary values for blood stage gene expression (max and mean)
pviv.B %<>% rowwise() %>% mutate(Pviv.Bmax=max(c_across(where(is.numeric))),
Pviv.Bmean=mean(c_across(where(is.numeric)))) %>%
ungroup()
pviv %<>%
left_join(pviv.B %>%
select(c('P.viv.Gene.ID', 'Pviv.Bmax', 'Pviv.Bmean')))
## Joining with `by = join_by(P.viv.Gene.ID)`
pviv %<>% rowwise() %>% mutate(PvL=log2((Pviv.Lmean+offst)/(Pviv.Bmax+offst)),
PvD=Pviv.Lmean-Pviv.Bmax) %>%
ungroup()
pviv %>% slice_sample(n=5)
## # A tibble: 5 × 13
## P.viv.Gene.ID gene_source_id.x P.viv.Mixed1 P.viv.Mixed2 Gene.ID
## <chr> <chr> <dbl> <dbl> <chr>
## 1 PVP01_1238400 PVP01_1238400 54.9 120. PF3D7_1468400
## 2 PVP01_0505700 PVP01_0505700 10.3 17.1 PF3D7_0828700
## 3 PVP01_0927200 PVP01_0927200 11.0 6.52 PF3D7_1126500
## 4 PVP01_1423400 PVP01_1423400 161. 141. PF3D7_0714300
## 5 PVP01_1308300 PVP01_1308300 202. 168. PF3D7_1209200
## # ℹ 8 more variables: Product.Description <chr>, gene_source_id.y <chr>,
## # Gene.Name.or.Symbol <chr>, Pviv.Lmean <dbl>, Pviv.Bmax <dbl>,
## # Pviv.Bmean <dbl>, PvL <dbl>, PvD <dbl>
pviv %>% ggplot(aes(x=PvL))+stat_density()+
geom_boxplot(mapping=aes(y=-1e-2), width=0.01)+
ggtitle(expr(italic('P. vivax')))+
theme_classic()
## Warning: Removed 60 rows containing non-finite outside the scale range
## (`stat_density()`).
## Warning: Removed 60 rows containing non-finite outside the scale range
## (`stat_boxplot()`).
sevenstg <- read.delim('Data/PlasmoDB/LopezBarragan_Pf7stagesNormalizedCounts.txt')
colnames(sevenstg)[6:12] <- gsub('(P..falciparum.Su.Seven.Stages.RNA.Seq.data...)(.+)(...unique..3D7.7Stages.RNA.Seq.)', '\\2', colnames(sevenstg[6:12]))
colnames(sevenstg) <- gsub(' ', '\\.', colnames(sevenstg))
sevenstg %<>% select(-c(source_id, Product.Description, gene_source_id, Ookinete, X))
sevenstg %<>% distinct(.keep_all=TRUE)
#Summarize gene expression in Pf blood stages as the max expression with exclusion of gametocytes
sevenstg %<>%
rowwise() %>%
mutate(seven.max=max(c_across(where(is.numeric))),
seven.max.excl.gam=max(Ring,Early.Trophozoite,Late.Trophozoite,Schizont))
sevenstg %>%
slice_sample(n=5)
## # A tibble: 5,460 × 10
## # Rowwise:
## Gene.ID Gene.Name.or.Symbol Ring Early.Trophozoite Late.Trophozoite Schizont
## <chr> <chr> <dbl> <dbl> <dbl> <dbl>
## 1 PF3D7_… VAR 32.2 4.01 3.8 1.62
## 2 PF3D7_… RIF 5.97 0.77 2.14 0
## 3 PF3D7_… VAR 2.13 0.15 0.06 0.16
## 4 PF3D7_… RIF 4.56 4.79 15.9 1.11
## 5 PF3D7_… N/A 1.52 0 2.23 0
## 6 PF3D7_… RIF 2.52 0.14 0.46 0
## 7 PF3D7_… N/A 0 0 0 0
## 8 PF3D7_… RIF 1.31 0 0 0
## 9 PF3D7_… RIF 0.48 2.18 0.36 0
## 10 PF3D7_… RIF 2.68 0 0.12 0
## # ℹ 5,450 more rows
## # ℹ 4 more variables: Gametocyte.II <dbl>, Gametocyte.V <dbl>, seven.max <dbl>,
## # seven.max.excl.gam <dbl>
idc.B <- read.xlsx('Data/PlasmoDB/Bartfai_PfIDCtimepoints.xlsx')
colnames(idc.B) <- gsub('(sense...)(.+)(...unique.+)', '\\2', colnames(idc.B))
# Summarize as mean, max, and cumulative expression for each gene
idc.B %<>% rowwise() %>% mutate(idcB.max=max(c_across(where(is.numeric))),
idcB.mean=mean(c_across(where(is.numeric))),
idcB.cumul=sum(c_across(starts_with('T')))
) %>%
ungroup()
idc.B %>%
slice_sample(n=5)
## # A tibble: 5 × 16
## Gene.ID source_id Product.Description gene_source_id Gene.Name.or.Symbol
## <chr> <chr> <chr> <chr> <chr>
## 1 PF3D7_1111500 PF3D7_11… acylphosphatase, p… PF3D7_1111500 N/A
## 2 PF3D7_0817500 PF3D7_08… histidine triad nu… PF3D7_0817500 HINT1
## 3 PF3D7_0710600 PF3D7_07… 60S ribosomal prot… PF3D7_0710600 RPL34
## 4 PF3D7_1435700 PF3D7_14… ataxin-2 like prot… PF3D7_1435700 N/A
## 5 PF3D7_0532600 PF3D7_05… Plasmodium exporte… PF3D7_0532600 N/A
## # ℹ 11 more variables: T40 <dbl>, T35 <dbl>, T25 <dbl>, T30 <dbl>, T15 <dbl>,
## # T20 <dbl>, T05 <dbl>, T10 <dbl>, idcB.max <dbl>, idcB.mean <dbl>,
## # idcB.cumul <dbl>
In order to compute the divergence between liver and blood stage expression within each model of infection, the expression values will be transformed into ranks.
pcyn %<>% ungroup() %>%
mutate(PcLiverRank=rank(P.cyn.Liver.Schizont),
PcBloodRank=rank(PcynB.mean))
pviv %<>% ungroup() %>%
mutate(PvLiverRank=rank(Pviv.Lmean),
PvBloodRank=rank(Pviv.Bmax))
LB.ranks <- inner_join(pcyn %>% select(Gene.ID, P.cyn.Gene.ID, PcLiverRank, PcBloodRank),
pviv %>% select(Gene.ID, P.viv.Gene.ID, PvLiverRank, PvBloodRank),
by='Gene.ID'
) %>%
mutate(PcRankDiff=abs(PcLiverRank-PcBloodRank),
PcLr=log2(PcLiverRank/PcBloodRank),
PvRankDiff=abs(PvLiverRank-PvBloodRank),
PvLr=log2(PvLiverRank/PvBloodRank))
## Warning in inner_join(pcyn %>% select(Gene.ID, P.cyn.Gene.ID, PcLiverRank, : Detected an unexpected many-to-many relationship between `x` and `y`.
## ℹ Row 236 of `x` matches multiple rows in `y`.
## ℹ Row 167 of `y` matches multiple rows in `x`.
## ℹ If a many-to-many relationship is expected, set `relationship =
## "many-to-many"` to silence this warning.
LB.ranks %>%
slice_sample(n=5)
## Gene.ID P.cyn.Gene.ID PcLiverRank PcBloodRank P.viv.Gene.ID PvLiverRank
## 1 PF3D7_0728400 PcyM_0217600 3735.5 2954.5 PVP01_0214100 1617.5
## 2 PF3D7_1231300 PcyM_1453900 28.0 1430.0 PVP01_1449500 399.0
## 3 PF3D7_1308500 PcyM_1410100 701.5 1263.0 PVP01_1409500 3075.0
## 4 PF3D7_1318100 PcyM_1419800 3919.0 2877.0 PVP01_1419000 3681.0
## 5 PF3D7_1135300 PcyM_0940100 2786.0 2580.0 PVP01_0936100 3297.0
## PvBloodRank PcRankDiff PcLr PvRankDiff PvLr
## 1 1834.5 781.0 0.3383874 217.0 -0.1816212
## 2 348.0 1402.0 -5.6744445 51.0 0.1973014
## 3 2602.5 561.5 -0.8483396 472.5 0.2406882
## 4 2775.0 1042.0 0.4459204 906.0 0.4076100
## 5 2154.0 206.0 0.1108242 1143.0 0.6141356
Visualize
p1 <- LB.ranks %>% ggplot(aes(x=PvBloodRank, y=PvLiverRank))+
geom_density2d_filled(show.legend = FALSE)+
geom_point(size=0.1, alpha=0.2, color='white')+
labs(x='Blood', y='Liver')+
ggtitle(expression(italic('P. vivax')))+
scale_fill_viridis_d(option='mako')+
scale_x_continuous(expand=expansion(mult=0))+
scale_y_continuous(expand=expansion(mult=0))+
theme_classic()+theme(aspect.ratio=1)
p2 <- LB.ranks %>% ggplot(aes(x=PcBloodRank, y=PcLiverRank))+
geom_density2d_filled(show.legend = FALSE)+
geom_point(size=0.1, alpha=0.2, color='white')+
labs(x='Blood', y='Liver')+
ggtitle(expression(italic('P. cynomolgi')))+
scale_fill_viridis_d(option='mako')+
scale_x_continuous(expand=expansion(mult=0))+
scale_y_continuous(expand=expansion(mult=0))+
theme_classic()+theme(aspect.ratio=1)
p1+p2+plot_annotation(title='Gene expression rank')
For the liver stage candidates, we want the genes with high expression at any point in the blood stage to score lower. Because we have multiple datasets describing the blood stage, we summarize blood expression as the maximum rank across datasets for each gene. This will be useful for liver stage candidates selection, but might need to be tweaked for blood candidates (because it may prioritize candidates with very high expression during a very limited section of the blood stage cycle, e.g. gametocytes, which are not very abundant in circulation and are conceivably not present for a long time).
idc.B %>%
select(Gene.ID, Gene.Name.or.Symbol, idcB.cumul) %>%
slice_sample(n=5)
## # A tibble: 5 × 3
## Gene.ID Gene.Name.or.Symbol idcB.cumul
## <chr> <chr> <dbl>
## 1 PF3D7_0920400 N/A 164.
## 2 PF3D7_0818400 FCF1 3622.
## 3 PF3D7_0903600 N/A 1191.
## 4 PF3D7_1202200 MPC 369.
## 5 PF3D7_1218600 RRS 567.
scoring <- LB.ranks %>% select(c(Gene.ID, PcLr, PcRankDiff, PvLr, PvRankDiff)) %>%
inner_join(Res %>% select(c(Gene.ID, R)),
by='Gene.ID') %>%
inner_join(idc.B %>% select(Gene.ID, Gene.Name.or.Symbol, idcB.cumul),
by='Gene.ID') %>%
rowwise() %>%
mutate(logCumul=log10(idcB.cumul))
## Warning in inner_join(., idc.B %>% select(Gene.ID, Gene.Name.or.Symbol, : Detected an unexpected many-to-many relationship between `x` and `y`.
## ℹ Row 147 of `x` matches multiple rows in `y`.
## ℹ Row 82 of `y` matches multiple rows in `x`.
## ℹ If a many-to-many relationship is expected, set `relationship =
## "many-to-many"` to silence this warning.
maxCumul = max(scoring$logCumul)
minCumul = min(scoring$logCumul)
scoring %<>% rowwise() %>%
mutate(mu=(logCumul-minCumul)/(maxCumul-minCumul))
scoring %<>% rowwise() %>%
mutate( F1= ( PcLr*abs(PcRankDiff) ) + ( PvLr*abs(PvRankDiff) ),
F3= 1/(abs(R))) %>%
mutate(F2=ifelse(F1>0, 1-mu, ifelse(F1<0, mu, 0))) %>%
mutate(Score=F1*F2*F3) %>%
distinct() %>%
ungroup()
scoring %>%
select(Gene.ID, Gene.Name.or.Symbol, F1, F2, F3, Score) %>%
arrange(-Score)
## # A tibble: 3,848 × 6
## Gene.ID Gene.Name.or.Symbol F1 F2 F3 Score
## <chr> <chr> <dbl> <dbl> <dbl> <dbl>
## 1 PF3D7_0602300 PALM 41934. 0.739 0.170 5277.
## 2 PF3D7_1418100 LISP1 48421. 0.861 0.0872 3635.
## 3 PF3D7_0615100 ENR 28416. 0.737 0.165 3449.
## 4 PF3D7_0211400 KASIII 37814. 0.682 0.125 3226.
## 5 PF3D7_1201700 N/A 40725. 0.549 0.127 2835.
## 6 PF3D7_1446400 pdhB 35725. 0.604 0.129 2792.
## 7 PF3D7_0922900 FabG 32765. 0.673 0.125 2758.
## 8 PF3D7_1246100 ALAS 29526. 0.653 0.129 2488.
## 9 PF3D7_1312000 MCAT 18860. 0.714 0.160 2160.
## 10 PF3D7_1368000 N/A 15757. 0.599 0.226 2134.
## # ℹ 3,838 more rows
scoring %<>%
rowwise() %>%
mutate(Gene.label=ifelse(Gene.Name.or.Symbol!='N/A', Gene.Name.or.Symbol, Gene.ID))
p1 <- scoring %>% left_join(LB.ranks, by='Gene.ID') %>%
ggplot(aes(x=PvBloodRank, y=PvLiverRank, color=Score))+
geom_point(alpha=0.7, size=0.3)+
labs(x='Blood', y='Liver')+
ggtitle(expression(italic('P. vivax')))+
scale_color_viridis_c(option='inferno')+
theme_classic()+theme(aspect.ratio=1)
p2 <- scoring %>% left_join(LB.ranks, by='Gene.ID') %>%
ggplot(aes(x=PcBloodRank, y=PcLiverRank, color=Score))+
geom_point(alpha=0.7, size=0.3)+
labs(x='Blood', y='Liver')+
ggtitle(expression(italic('P. cynomolgi')))+
scale_color_viridis_c(option='inferno')+
theme_classic()+theme(aspect.ratio=1)
(p1|p2)&plot_annotation(title='Rank of expression')
scoring %>%
left_join(idc.B, by='Gene.ID') %>%
select(c(Gene.ID, Score, starts_with('T'))) %>%
pivot_longer(cols=starts_with('T'),
names_to='Time',
values_to='Expression') %>%
mutate(Time=as.numeric(gsub('T', '', Time))) %>%
arrange(-Score) %>%
ggplot(aes(x=Time, y=Expression, color=Score,
group=Gene.ID))+
geom_line(mapping=aes(order=Score))+
scale_color_viridis_c(option='inferno')+
scale_y_log10(oob=oob_squish)+annotation_logticks(outside=TRUE, sides='l')+
scale_x_continuous(expand=expansion(mult=0))+
theme_classic()
## Warning in geom_line(mapping = aes(order = Score)): Ignoring unknown
## aesthetics: order
## Warning in scale_y_log10(oob = oob_squish): log-10 transformation introduced
## infinite values.
write.table(scoring,
file='GeneExpressionScores.txt',
sep='\t', quote=FALSE, row.names=FALSE)